#!/bin/bash
# Launcher for a local vLLM OpenAI-compatible server (Qwen3-32B, see command below).
# -e/-u/-o pipefail: abort on any failed command, unset variable, or failed
# pipeline stage; -x: trace each command for debugging.
set -euxo pipefail

# Very generous timeouts so long-running generations don't trip vLLM's
# engine-iteration / RPC watchdogs. NOTE(review): units (seconds vs ms) are
# whatever vLLM defines for each variable — confirm against vLLM env-var docs.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000
export VLLM_ENFORCE_CUDA_GRAPH=1



# Serve Qwen3-32B at a 32k context window, tensor-parallel across 4 GPUs,
# listening on 0.0.0.0:8000.
# Fixes: $(( ... )) replaces the deprecated $[ ... ] arithmetic form, and flag
# spellings are normalized to dash style (vLLM accepts '_' and '-' forms
# interchangeably). The shared 32k value is hoisted into one variable so the
# model length and CUDA-graph capture length cannot drift apart.
max_model_len=$((32 * 1024))

vllm serve /home/weights/Qwen/Qwen3-32B \
  --served-model-name "qwen3-32" \
  -tp 4 \
  --max-model-len "$max_model_len" \
  --max-seq-len-to-capture "$max_model_len" \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 8000 \
  --compilation-config '{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}'
# Optional reasoning-output parsing (append to the command above to enable):
# --enable-reasoning --reasoning-parser qwen3


# vllm serve /home/weights/Qwen/Qwen3-32B  --served_model_name "qwen3-32" -tp 4 --trust-remote-code --host 0.0.0.0 --port 8000 --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --max-model-len 131072  
# --enable-reasoning --reasoning-parser  qwen3
